This analysis explores a wheat dataset containing yield information across different countries, locations, and time periods. The primary objectives are:
library(data.table) # For fast file reading
library(tidyverse) # Data manipulation
library(knitr) # Tables
library(ggplot2) # Static plots only
library(scales) # For formatting
library(leaflet)
library(gt)
library(kableExtra)
library(plotly) # Add this to your libraries section
# Parallel processing
library(furrr) # Parallel mapping functions
# Load wheat data with irrigation and rainfall calendar information
# Note: fileEncoding handles special characters in country/location names
data <- read.csv("../../../data/Data/wheat_data_with_calendar.csv", fileEncoding = "latin1")
# Initial data inspection
cat("Dataset dimensions:", dim(data), "\n")
## Dataset dimensions: 80382 86
cat("Number of countries:", length(unique(data$Country)), "\n")
## Number of countries: 42
cat("Time period covered:", range(data$Observation.period, na.rm = TRUE), "\n")
## Time period covered: 2022
# Display all column names for reference
names(data)
## [1] "id"
## [2] "Data.ID"
## [3] "Location"
## [4] "State.Region.County.Province"
## [5] "Country"
## [6] "Continent"
## [7] "Latitude..N.S."
## [8] "Longitude..E.W."
## [9] "Conversion.for.latitude"
## [10] "Conversion.for.longitude"
## [11] "Location.source"
## [12] "Observation.period"
## [13] "Wheat.Type"
## [14] "Crop.variety"
## [15] "Tillage.type"
## [16] "Planting.date"
## [17] "Harvesting.date"
## [18] "Flowering.stage"
## [19] "Treatment"
## [20] "Treatment.type"
## [21] "Grain.yield..tons.ha.1."
## [22] "SE...22"
## [23] "Climate"
## [24] "Mean.annual.temperature..Â.C."
## [25] "Water.regime"
## [26] "Mean.annual.precipitation..mm."
## [27] "Irrigation..mm."
## [28] "Soil.type"
## [29] "Soil.depth..cm."
## [30] "Sand...."
## [31] "Silt...."
## [32] "Clay...."
## [33] "Soil.texture"
## [34] "Soil.organic.carbon..g.C.kg.1."
## [35] "Soil.organic.carbon...."
## [36] "Category"
## [37] "TN..g.N.kg.1."
## [38] "Category.1"
## [39] "C.N.ratio"
## [40] "Category.2"
## [41] "Soil.pH"
## [42] "Category.3"
## [43] "BD..g.cm.3."
## [44] "Category.4"
## [45] "Replicates"
## [46] "N.type"
## [47] "Category.5"
## [48] "N.rate..kg.N.ha.1."
## [49] "N.fertilizer.management"
## [50] "P.type"
## [51] "P.rate..kg.P.ha.1."
## [52] "Straw.return"
## [53] "Plastic.film.mulching"
## [54] "Emissions..yes.no."
## [55] "Cumulative.N2O.fluxes..kg.N.ha.1."
## [56] "SE...56"
## [57] "SD...57"
## [58] "Yield.scaled.N2O.emission..g.N.Mg.1."
## [59] "SE...59"
## [60] "SD...60"
## [61] "PFPN..kg.kg..1."
## [62] "ANE..kg.kg.1."
## [63] "EFd...."
## [64] "Pest.prescence....64"
## [65] "Pest.detected...65"
## [66] "Pest.severity.score.......66"
## [67] "Pest.abundance...67"
## [68] "Pest.prescence....68"
## [69] "Pest.detected...69"
## [70] "Pest.severity.score.......70"
## [71] "Pest.abundance...71"
## [72] "Pest.prescence....72"
## [73] "Main.weed.detected"
## [74] "Total.weed.coverage"
## [75] "Main.weed.abundance.of.total.coverage"
## [76] "X"
## [77] "Y"
## [78] "unit_code"
## [79] "rainfed_start_date"
## [80] "rainfed_end_date"
## [81] "irr_start_date"
## [82] "irr_end_date"
## [83] "start_date"
## [84] "end_date"
## [85] "Planting.date.1"
## [86] "Harvesting.date.1"
# Select key variables and rename for easier handling
yield <- data %>%
select(Country,
Observation.period,
Location,
Continent,
State.Region.County.Province,
longitude = Conversion.for.longitude, # Rename for clarity
latitude = Conversion.for.latitude, # Rename for clarity
yield_value = Grain.yield..tons.ha.1.) %>% # Main outcome variable
mutate(
# Convert character columns to appropriate numeric types
longitude = as.numeric(longitude),
latitude = as.numeric(latitude),
yield_value = as.numeric(yield_value),
Observation.period = as.numeric(Observation.period)
) %>%
filter(
# Remove problematic observations
!is.na(yield_value), # Remove missing yield values
!is.na(longitude), # Remove missing coordinates
!is.na(latitude),
yield_value <= 30, # Remove unrealistic yield outliers (>30 tons/ha)
nchar(as.character(Observation.period)) == 4 # Keep only 4-digit years
)
cat("Cleaned dataset dimensions:", dim(yield), "\n")
## Cleaned dataset dimensions: 79710 8
cat("Yield range:", range(yield$yield_value), "tons/ha\n")
## Yield range: 0.01673634 28.5 tons/ha
# Summary statistics for the cleaned dataset
summary(yield)
## Country Observation.period Location Continent
## Length:79710 Min. :1951 Length:79710 Length:79710
## Class :character 1st Qu.:2014 Class :character Class :character
## Mode :character Median :2017 Mode :character Mode :character
## Mean :2013
## 3rd Qu.:2018
## Max. :2022
## State.Region.County.Province longitude latitude
## Length:79710 Min. :-121.926 Min. :-37.82
## Class :character 1st Qu.:-120.110 1st Qu.: 27.33
## Mode :character Median :-109.090 Median : 36.34
## Mean : -70.241 Mean : 32.11
## 3rd Qu.: -0.373 3rd Qu.: 38.54
## Max. : 152.390 Max. : 60.70
## yield_value
## Min. : 0.01674
## 1st Qu.: 3.85785
## Median : 5.43659
## Mean : 5.38656
## 3rd Qu.: 6.88948
## Max. :28.50000
# Create color palette for yield values
pal <- colorNumeric(palette = "YlOrRd", domain = yield$yield_value)
# Create interactive map showing global wheat yield distribution
leaflet(yield) %>%
addTiles() %>% # Add base map tiles
addCircleMarkers(
lng = ~longitude,
lat = ~latitude,
color = ~pal(yield_value), # Color by yield value
popup = ~paste("Country:", Country, "<br>",
"Location:", Location, "<br>",
"Yield:", round(yield_value, 2), "tons/ha"),
radius = 3,
fillOpacity = 0.7
) %>%
addLegend("bottomright",
pal = pal,
values = ~yield_value,
title = "Yield (tons/ha)")